tidycensus, load in variables on median household income and median housing value from the acs5 2013-2017.library(tidycensus)
library(tidyverse)
library(viridis)
library(plyr)
library(gtools)
census_api_key("8eab9b16f44cb26460ecbde164482194b7052772")
Var<-c("B19013_001","B25077_001")
## c(Median household Income, Median Housing Value)
CenDF <- get_acs(geography = "county",
variables = Var,
year = 2017,
survey = "acs5",
geometry = TRUE,
shift_geo = TRUE)
CenDF<-CenDF %>%
mutate(variable=case_when(
variable=="B19013_001" ~ "HHIncome",
variable=="B25077_001" ~ "HouseValue")) %>%
select(-moe) %>%
spread(variable, estimate) %>% #Spread moves rows into columns
mutate(HHInc_HousePrice_Ratio=round(HouseValue/HHIncome,2))
scale_fill_viridis to create natural breaks for color sequencing and to mapHouse-Price-to-Income RatioPlot1<- ggplot(CenDF) +
geom_sf(aes(fill = HHInc_HousePrice_Ratio), color=NA) +
coord_sf(datum=NA) +
labs(title = "House-Price-to-Income Ratio \n R Built-in Color Cut",
caption = "Source: ACS 5-year, 2013-2017",
fill = "Price-Income Ratio") +
scale_fill_viridis(direction=-1)
Plot1
R has a variety of functions that allow the user to create color scales easily. These functions generally require you to specify a color on each end of the spectrum and they will interpolate the values between based upon how many levels you desire.
We select colors to communicate information about our data. If we are using a continuous variable the most basic decision is whether we want to represent it as positive and negative deviations from the average (a divergent scale), or as a continuum of low to high values (a sequential scale). If we have categorical data, it is generally visualized through different colors representing each group (a qualitative scale).
plot( 1:7, rep(1,7), ylim=c(-0.5,3.5), xlim=c(0,12), yaxt="n", xaxt="n", bty="n", xlab="", ylab="" )
color.function <- colorRampPalette( c("gray80","darkred") )
col.ramp <- color.function( 7 ) # number of groups you desire
points( 1:7, rep(3,7), pch=15, cex=8, col=col.ramp )
color.function <- colorRampPalette( c("darkred","gray80","steelblue") )
col.ramp <- color.function( 7 ) # number of groups you desire
points( 1:7, rep(2,7), pch=15, cex=8, col=col.ramp )
color.function <- colorRampPalette( c("gray80","black") )
col.ramp <- color.function( 7 ) # number of groups you desire
points( 1:7, rep(1,7), pch=15, cex=8, col=col.ramp )
text( 8, 3, "Sequential", pos=4 )
text( 8, 2, "Divergent", pos=4 )
text( 8, 1, "Grayscale", pos=4 )
HHInc_HousePrice_Ratio from continuous variable to factor variable with 2 levels## Convert Integer into factor variable
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio, q = seq(0, 1, by = 0.5))
CenDF$fill_factor = mapvalues(CenDF$fill_factor, from = levels(CenDF$fill_factor),
to = c("low","high"))
## Assign 2 colors
col.ramp <- viridis(n = 2) # number of groups you desire
Plot2<-ggplot(CenDF) +
geom_sf(aes(fill = fill_factor), color=NA) +
coord_sf(datum=NA) +
labs(title = "House-Price-to-Income Ratio",
caption = "Source: ACS 5-year, 2013-2017",
fill = "Price-Income Ratio") +
scale_fill_manual("Price-Income Ratio",values = col.ramp)
Plot2
HHInc_HousePrice_Ratio from continuous variable to factor variable with 3 levels## Convert Integer into factor variable
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio, q = seq(0, 1, by = .33))
CenDF$fill_factor = mapvalues(CenDF$fill_factor, from = levels(CenDF$fill_factor),
to = c("low","med", "high"))
## Assign 3 colors
col.ramp <- viridis(n = 3) # number of groups you desire
Plot3<-ggplot(CenDF) +
geom_sf(aes(fill = fill_factor), color=NA) +
coord_sf(datum=NA) +
labs(title = "House-Price-to-Income Ratio",
caption = "Source: ACS 5-year, 2013-2017",
fill = "Price-Income Ratio") +
scale_fill_manual("Price-Income Ratio \n (Terciles)",values = col.ramp)
Plot3
HHInc_HousePrice_Ratio from continuous variable to factor variable with 5 levels## Convert Integer into factor variable
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio, q = seq(0, 1, by = .2))
CenDF$fill_factor = mapvalues(CenDF$fill_factor, from = levels(CenDF$fill_factor),
to = c("1","2","3", "4","5"))
## Assign 5 colors
col.ramp <- viridis(n = 5) # number of groups you desire
Plot4<-ggplot(CenDF) +
geom_sf(aes(fill = fill_factor), color=NA) +
coord_sf(datum=NA) +
labs(title = "House-Price-to-Income Ratio",
caption = "Source: ACS 5-year, 2013-2017",
fill = "Price-Income Ratio") +
scale_fill_manual("Price-Income Ratio \n (Quintiles)",values = col.ramp)
Plot4
HHInc_HousePrice_Ratio from continuous variable to factor variable with 6 levels## Convert Integer into factor variable
CenDF$fill_factor <- quantcut(CenDF$HHInc_HousePrice_Ratio, q = c(0,.1,.25,.5,.75,.9,1))
#CenDF$fill_factor = mapvalues(CenDF$fill_factor, from = levels(CenDF$fill_factor),
# to = c("< .1",".1-.25",".25-.5", ".5-.75",".75-.9","> .9"))
## Assign 6 colors
col.ramp <- viridis(n = 6) # number of groups you desire
Plot5<-ggplot(CenDF) +
geom_sf(aes(fill = fill_factor), color=NA) +
coord_sf(datum=NA) +
labs(title = "House-Price-to-Income Ratio",
caption = "Source: ACS 5-year, 2013-2017",
fill = "Price-Income Ratio") +
scale_fill_manual("Price-Income Ratio",values = col.ramp)
Plot5
library(gridExtra)
grid.arrange(Plot1,Plot3,Plot4,Plot5, nrow=2)
In get_acs function, change year from 2017 to 2012, and call new DF as CenDF2012.
Then create the ratio of household price divided by median household income for the 2008-2012. Call the variable name HHInc_HousePrice_Ratio2012
library(tidycensus)
library(tidyverse)
library(viridis)
library(plyr)
library(gtools)
census_api_key("8eab9b16f44cb26460ecbde164482194b7052772")
Var<-c("B19013_001","B25077_001")
# Download 2008-2012 df
CenDF2012 <- get_acs(geography = "county",
variables = Var,
year = 2012,
survey = "acs5",
geometry = FALSE)
#Create new variable for the housing price to income ratio.
CenDF2012<-CenDF2012 %>%
mutate(variable=case_when(
variable=="B19013_001" ~ "HHIncome2012",
variable=="B25077_001" ~ "HouseValue2012")) %>%
select(-moe,-NAME) %>%
spread(variable, estimate) %>% #Spread moves rows into columns
mutate(HHInc_HousePrice_Ratio2012=round(HouseValue2012/HHIncome2012,2)) Hint: you can not merge to sf DFs. Go back and turn Geometry=FALSE for one of your 2012 DF
CenDF<-merge(CenDF,CenDF2012,by.all="GEOID", all.x=TRUE) # all.x=TRUE makes sure that the merged dataframe keeps all counties in CenDF even if missing in CenDF2012Look at summary statistics and plot histograms for the 2008-2012 housing price to income ratio versus the 2013-2017 housing price to income ratio.
Hist1<-ggplot(CenDF, aes(HHInc_HousePrice_Ratio)) +
geom_histogram(fill = "firebrick2",
color = "white", bins = 60) +
xlab("house price to income ratio by county, 2017") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = mean(CenDF$HHInc_HousePrice_Ratio,na.rm=TRUE), lty = "dashed")
Hist2<-ggplot(CenDF, aes(HHInc_HousePrice_Ratio2012)) +
geom_histogram(fill = "firebrick2",
color = "white", bins = 60) +
xlab("house price to income ratio by county, 2012") +
theme(plot.title = element_text(hjust = 0.5)) +
geom_vline(xintercept = mean(CenDF$HHInc_HousePrice_Ratio2012,na.rm=TRUE), lty = "dashed")
library(gridExtra)
grid.arrange(Hist1,Hist2, nrow=2)
summary(CenDF$HHInc_HousePrice_Ratio2012)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.900 2.090 2.570 2.801 3.215 12.440 3
summary(CenDF$HHInc_HousePrice_Ratio)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.670 2.130 2.535 2.758 3.112 11.820 2Hint: To calculate percentage change of a variable use the following equation: pct_change = 100 * (Present - Past) / Past. Use mutate and replace present and past with the 2017 and 2012 house price to income ratios, respectively.
```r
## Change Variable
CenDF<-CenDF %>%
mutate(pct_change = 100 * (`HHInc_HousePrice_Ratio` - `HHInc_HousePrice_Ratio2012`) / `HHInc_HousePrice_Ratio2012`)
```
Hint: You will make minor changes to the code you used to create Plot 1 above. e.g. change the fill variable in ggplot to the name of the change variable you created in the previous step.
```r
upper_limit <- round(max(CenDF$pct_change,na.rm=TRUE) + 10, -1)
lower_limit <- round(min(CenDF$pct_change,na.rm=TRUE) - 10, -1)
Plot6<- ggplot(CenDF,aes(fill = pct_change)) +
geom_sf(size = 0) +
#geom_sf(data = major_roads_geo, color = "white", size = 0.8, fill = NA) +
#geom_sf(data = minor_roads_geo, color = "white", size = 0.4, fill = NA) +
scale_fill_viridis(option = "viridis", name = "% Change", limits = c(lower_limit, upper_limit), breaks = seq(lower_limit, upper_limit, 20)) +
labs(title="Changes in House Price to Income Ratio",
subtitle = "2017 5-Year Estimates vs. 2012 5-Year Estimates for Census Tracts",
caption = paste0(
"Data sources:",
"\n U.S. Census Bureau, 2012 and 2017 American Community Survey 5-Year Estimates"
)
) +
theme(plot.caption = element_text(hjust = 0, margin = margin(t = 15))) +
theme(axis.ticks = element_blank(), axis.text = element_blank()) +
theme(panel.background = element_blank())
```
In part 1, you transformed HHInc_HousePrice_Ratio from continuous variable to factor variable with 4 levels (i.e. quartiles), and created new cloropleth maps. Looking at grid.arrange to compare your new cloropleth maps (i.e. Plot 1, Plot3, and Plot 5), answer the following questions:
1a. Which map do you think tells the most (and least) accurate representation of the data?
Answer: I like Plot 5 the best that separates house price to income into 6 bins. This plot gives more variation for the west coast, and also the legend shows actual values of each grouping rather than a quartile or quantile as in the other plots.
1b. Do you think its best to let R automatically cut the varyiable for you (i.e. Plot 1) or better to adjust the variable cut yourself?
Answer: Cutting a variable and assigning unique colors myself offers more flexibility and more accurately tells an accurate story of the underlying data.
In Step 3 of Part 2, what do you notice when looking at the descriptive statistics for the house price to income ratio during the two different time periods (i.e. is there an average increase, decrease or no change over time)? What is the range of the change variable and interpet the meaning?
Answer: There is a very small reduction in average house price to income ratio from 2008-2012 (mean=2.801) to 2013-2017 (mean=2.758). The change variable ranges from -52.1 to 127.9, meaning that the house price to income ratio over the time period decreased at most by 52% and increased at most by 128%. The average change was an increase of .018%, indicating on average house price to income ratio remained largely constant over the short time period considered here.
In Step 5 of Part 2, create at least 2 new cloropleth maps based on alternative cut values for the change in house price to income ratio. Comment on what you did and what map provides the most accurate depiction of the underlying data.
Use grid.arrange to compare the various cloropleth maps side by side and comment on which map offers the most accurate interpration of the data.
hint: Follow a similar procedure as in Part 1 to look at terciles, quartiles, quantiles, etc.). You will need to remove the scale_fill_viridis in plot 6 with scale_fill_manual as in Plot 5.
Answer: I broke the percentage change variable into quartiles and into 6 groups, similar to step 1. I was not happy with either map, however, in part because to much variation in the colors made it hard to discern a pattern. For a third alternative, I classified the percentage change variable into only 2 groups – positive % change (i.e. less affordable) and negative % change (i.e. more affordable). I like this map the best as it easily shows which counties in US became more and less affordable.
# Alternative Plot 1
## Convert Integer into factor variable
CenDF$fill_factor1 <- quantcut(CenDF$pct_change, q = c(0,.25,.5,.75,1))
## Assign 6 colors
col.ramp <- viridis(n = 4) # number of groups you desire
Plot7<- ggplot(CenDF,aes(fill = fill_factor1)) +
geom_sf(size = 0) +
#geom_sf(data = major_roads_geo, color = "white", size = 0.8, fill = NA) +
#geom_sf(data = minor_roads_geo, color = "white", size = 0.4, fill = NA) +
#scale_fill_viridis(option = "viridis", name = "% Change", limits = c(lower_limit, upper_limit), breaks = seq(lower_limit, upper_limit, 20)) +
scale_fill_manual("% Change \n Quartiles",values = col.ramp) +
theme(plot.caption = element_text(hjust = 0, margin = margin(t = 15))) +
theme(axis.ticks = element_blank(), axis.text = element_blank()) +
theme(panel.background = element_blank())
## Alternative Plot 2
## Convert Integer into factor variable
CenDF$fill_factor2 <- quantcut(CenDF$pct_change, q = c(0,.1,.25,.5,.75,.9,1))
## Assign 6 colors
col.ramp <- viridis(n = 6) # number of groups you desire
Plot8<- ggplot(CenDF,aes(fill = fill_factor2)) +
geom_sf(size = 0) +
#geom_sf(data = major_roads_geo, color = "white", size = 0.8, fill = NA) +
#geom_sf(data = minor_roads_geo, color = "white", size = 0.4, fill = NA) +
#scale_fill_viridis(option = "viridis", name = "% Change", limits = c(lower_limit, upper_limit), breaks = seq(lower_limit, upper_limit, 20)) +
scale_fill_manual("% Change",values = col.ramp) +
theme(plot.caption = element_text(hjust = 0, margin = margin(t = 15))) +
theme(axis.ticks = element_blank(), axis.text = element_blank()) +
theme(panel.background = element_blank())
## Alternative Plot 3
## Convert Integer into factor variable
CenDF$fill_factor3 <- ifelse(CenDF$pct_change>0,"More Affordable","Less Affordable")
## Assign 6 colors
col.ramp <- viridis(n = 2) # number of groups you desire
Plot9<- ggplot(CenDF,aes(fill = fill_factor3)) +
geom_sf(size = 0) +
#geom_sf(data = major_roads_geo, color = "white", size = 0.8, fill = NA) +
#geom_sf(data = minor_roads_geo, color = "white", size = 0.4, fill = NA) +
#scale_fill_viridis(option = "viridis", name = "% Change", limits = c(lower_limit, upper_limit), breaks = seq(lower_limit, upper_limit, 20)) +
scale_fill_manual("% Change",values = col.ramp) +
theme(plot.caption = element_text(hjust = 0, margin = margin(t = 15))) +
theme(axis.ticks = element_blank(), axis.text = element_blank()) +
theme(panel.background = element_blank())#Edit me
#Answer
#Visualize Plots
grid.arrange(Plot6,Plot7,Plot8,Plot9, nrow=2)